Lesson 4


Scatterplots and Perceived Audience Size

Notes:


Scatterplots

Notes:

library(ggplot2)
# Switch the working directory so the data file resolves by its bare name.
setwd("~/projects/Classes/FoundationsOfDataScience_sliderule/github/UD651/L4")

# The data file is tab-separated, hence the explicit separator.
pf <- read.csv(file = 'pseudo_facebook.tsv', sep = '\t')

# First look: friend_count (y) against age (x), one point per row of pf.
qplot(x = age, y = friend_count, data = pf)


What are some things that you notice right away?

Response:


ggplot Syntax

Notes:

# Jittered scatterplot of friend_count against age on a square-root y axis.
# Jitter is horizontal only (height = 0) so the y values are not perturbed.
# xlim() limits the x scale to 13-90; rows outside are dropped with a warning.
ggplot(pf, aes(age, friend_count)) +
  xlim(13, 90) +
  geom_jitter(position = position_jitter(height = 0), alpha = 1 / 20) +
  coord_trans(y = "sqrt")
## Warning: Removed 5192 rows containing missing values (geom_point).


Overplotting

Notes:

# Same scatterplot with each point drawn at 1/20 opacity, so heavily
# overplotted regions show up darker.
ggplot(pf, aes(age, friend_count)) +
  xlim(13, 90) +
  geom_point(alpha = 1 / 20)
## Warning: Removed 4906 rows containing missing values (geom_point).

What do you notice in the plot?

Response:


Coord_trans()

Notes:

# Jittered (default width and height) scatterplot at 1/20 opacity,
# limited to ages 13-90.
ggplot(pf, aes(age, friend_count)) +
  xlim(13, 90) +
  geom_jitter(alpha = 1 / 20)
## Warning: Removed 5168 rows containing missing values (geom_point).

Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

# Scatterplot of friend_count against age with the y axis drawn on a
# square-root scale. The transformation is passed through coord_trans()'s
# `y` argument, matching the other plots in this file; `ytrans` is the
# deprecated spelling of the same argument.
qplot(age, friend_count, data = pf) +
  coord_trans(y = "sqrt")

# Point (no jitter) version of the square-root plot: 1/20 opacity,
# x scale limited to ages 13-90.
ggplot(pf, aes(age, friend_count)) +
  xlim(13, 90) +
  geom_point(alpha = 1 / 20) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).

# Five-number summary plus mean of the age column.
summary(pf[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00

What do you notice?


Alpha and Jitter

Notes:

# friendships_initiated against age on a log10 y axis. One is added to y
# before the transform (presumably so zero counts stay plottable - confirm).
# Jitter is horizontal only; the x scale is limited to ages 13-90.
ggplot(pf, aes(age, friendships_initiated + 1)) +
  xlim(13, 90) +
  geom_jitter(position = position_jitter(height = 0), alpha = 1 / 20) +
  coord_trans(y = "log10")
## Warning: Removed 5191 rows containing missing values (geom_point).


Overplotting and Domain Knowledge

Notes:


Conditional Means

Notes:

#install.packages('dplyr')
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Group users by age, then reduce every group to the mean and median of
# friend_count plus the group size, ordered by ascending age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)

# Show the first six rows of the summary.
head(pf.fc_by_age, n = 6)
## Source: local data frame [6 x 4]
## 
##     age friend_count_mean friend_count_median     n
##   (int)             (dbl)               (dbl) (int)
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196
library(dplyr)

# Per-age mean, median and count of friend_count, written as one pipeline
# and ordered by ascending age.
# Uses %>% rather than %.%: dplyr reports %.% as deprecated and names %>%
# as its replacement.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(friend_count_mean = mean(friend_count),
            friend_count_median = median(friend_count),
            n = n()) %>%
  arrange(age)
head(pf.fc_by_age, 20)
## Source: local data frame [20 x 4]
## 
##      age friend_count_mean friend_count_median     n
##    (int)             (dbl)               (dbl) (int)
## 1     13          164.7500                74.0   484
## 2     14          251.3901               132.0  1925
## 3     15          347.6921               161.0  2618
## 4     16          351.9371               171.5  3086
## 5     17          350.3006               156.0  3283
## 6     18          331.1663               162.0  5196
## 7     19          333.6921               157.0  4391
## 8     20          283.4991               135.0  3769
## 9     21          235.9412               121.0  3671
## 10    22          211.3948               106.0  3032
## 11    23          202.8426                93.0  4404
## 12    24          185.7121                92.0  2827
## 13    25          131.0211                62.0  3641
## 14    26          144.0082                75.0  2815
## 15    27          134.1473                72.0  2240
## 16    28          125.8354                66.0  2364
## 17    29          120.8182                66.0  1936
## 18    30          115.2080                67.5  1716
## 19    31          118.4599                63.0  1694
## 20    32          114.2800                63.0  1443

Create your plot!

# Median friend count at each age, drawn as a line.
# (Use friend_count_mean as y instead to plot the per-age mean.)
ggplot(pf.fc_by_age, aes(age, friend_count_median)) +
  geom_line()


Overlaying Summaries with Raw Data

Notes:

# Horizontally jittered friend_count + 1 against age on a log10 y axis,
# showing ages 13-90.
# A plot holds a single coordinate system: adding coord_trans() after
# coord_cartesian() replaced the latter, so its x limits were discarded.
# The limits are therefore given to coord_trans() itself (argument `xlim`;
# ggplot2 releases before 3.3.0 call it `limx`).
ggplot(aes(x = age, y = friend_count + 1), data = pf) +
  geom_jitter(alpha = 1/20, position = position_jitter(height = 0)) +
  coord_trans(y = "log10", xlim = c(13, 90))

# Raw friend_count by age (orange, horizontally jittered) overlaid with
# per-age summary lines: median (default colour), mean (blue) and the
# 10th / 90th percentiles (dashed purple). The view is zoomed to ages
# 13-70 and counts 0-1000 without dropping data from the summaries.
ggplot(aes(x = age, y = friend_count), data = pf) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  geom_point(alpha = 1/20,
             position = position_jitter(height = 0),
             color = 'orange') +
  # `fun` is the current name of the summary-function argument
  # (`fun.y` is deprecated since ggplot2 3.3.0).
  geom_line(stat = 'summary', fun = median) +
  geom_line(stat = 'summary', fun = mean, color = 'blue') +
  # Extra arguments for the summary function must travel through fun.args;
  # a bare `probs = ...` on the layer is not forwarded to quantile().
  geom_line(stat = 'summary', fun = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'purple') +
  geom_line(stat = 'summary', fun = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'purple')

What are some of your observations of the plot?

Response:


Moira: Histogram Summary and Scatterplot

See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.

Notes:


Correlation

Notes:

# Open the help page for cor.test().
?cor.test

# Pearson correlation test between age and friend_count over all rows.
cor.test(pf$age, pf$friend_count, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Same test, with the columns looked up inside pf via with().
with(
  pf,
  cor.test(x = age, y = friend_count, method = "pearson")
)
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response:

-0.027


Correlation on Subsets

Notes:

#with(                 , cor.test(age, friend_count))
# Pearson correlation test restricted to rows with age <= 70.
pf %>%
  subset(age <= 70) %>%
  with(cor.test(age, friend_count, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

Correlation Methods

Notes:

# Spearman rank correlation test on the same age <= 70 subset.
pf %>%
  subset(age <= 70) %>%
  with(cor.test(age, friend_count, method = "spearman"))
## Warning in cor.test.default(age, friend_count, method = "spearman"): Cannot
## compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  age and friend_count
## S = 1.5782e+14, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.2552934

Create Scatterplots

Notes:

# List the column names of pf.
colnames(pf)
##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"
#qplot(www_likes_received, www_likes_received/likes_received, data = pf)
#qplot(www_likes_received, likes_received, data = pf) +
#  coord_cartesian(xlim = c(0, 12500), ylim = c(0,50000)) 
#qplot(www_likes_received, likes_received, data = pf) +
#  coord_cartesian(xlim = c(0, 2500), ylim = c(0,20000)) 

# likes_received against www_likes_received, zoomed to x in [0, 300] and
# y in [0, 1250], with the conditional mean of likes_received and a line
# of www_likes_received plotted against itself for reference.
ggplot(aes(x = www_likes_received, y = likes_received), data = pf) +
  coord_cartesian(xlim = c(0, 300), ylim = c(0, 1250)) +
  geom_point(alpha = 1/10,
             position = position_jitter(height = 0),
             color = 'blue') +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0).
  geom_line(stat = 'summary', fun = mean) +
  # Map y inside aes() so every y value stays paired with its own row's x;
  # a raw pf$... vector given as a fixed parameter bypasses the layer's
  # data handling.
  geom_line(aes(y = www_likes_received))

***

Strong Correlations

Notes:

# likes_received against www_likes_received, zoomed to the 95th percentile
# of each variable, with a linear-model fit (red), the conditional mean of
# likes_received, and a line of www_likes_received plotted against itself.
ggplot(aes(x = www_likes_received, y = likes_received), data = pf) +
  coord_cartesian(xlim = c(0, quantile(pf$www_likes_received, 0.95)),
                  ylim = c(0, quantile(pf$likes_received, 0.95))) +
  geom_point(alpha = 1/10,
             position = position_jitter(height = 0),
             color = 'blue') +
  geom_smooth(method = "lm", color = "red") +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0).
  geom_line(stat = 'summary', fun = mean) +
  # Map y inside aes() so every y value stays paired with its own row's x;
  # a raw pf$... vector given as a fixed parameter bypasses the layer's
  # data handling.
  geom_line(aes(y = www_likes_received))

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

# Pearson correlation test between www_likes_received and likes_received
# over all rows of pf.
with(
  pf,
  cor.test(x = www_likes_received, y = likes_received, method = "pearson")
)
## 
##  Pearson's product-moment correlation
## 
## data:  www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

Response: cor 0.9479902

0.948


Moira on Correlation

Notes:

  • Highly correlated data (highly-coupled) may not be independent

More Caution with Correlation

Notes:

#install.packages('alr3')
library(alr3)
## Loading required package: car
# Load the Mitchell data set and open its help page.
data("Mitchell")
?Mitchell

Create your plot!

# List the column names of Mitchell.
colnames(Mitchell)
## [1] "Month" "Temp"
# Temp against Month modulo 12, so readings from the same position in
# each 12-month cycle share an x value.
ggplot(Mitchell, aes(Month %% 12, Temp)) +
  geom_point()


Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot.

  2. What is the actual correlation of the two variables? (Round to the thousandths place)

# Correlation test (default method) between Month and Temp.
with(
  Mitchell,
  cor.test(x = Month, y = Temp)
)
## 
##  Pearson's product-moment correlation
## 
## data:  Month and Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Making Sense of Data

Notes:

# Temp against Month folded onto a 12-month cycle. The layer-level mapping
# overrides x for the points only.
# Columns are referenced by bare name inside aes() so they are looked up in
# the layer's data; Mitchell$Month would bypass that lookup, which ggplot2
# discourages.
ggplot(aes(x = Month, y = Temp), data = Mitchell) +
  geom_point(aes(x = Month %% 12))

# Smallest and largest Month value.
range(Mitchell[["Month"]])
## [1]   0 203
# Temp against Month with an x-axis break every 12 months (0, 12, ..., 192),
# covering the 0-203 range of Month.
# Month is numeric, so the breaks belong on a continuous scale; a discrete
# scale is meant for categorical data.
ggplot(aes(x = Month, y = Temp), data = Mitchell) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))


A New Perspective

What do you notice? Response: banding and periodicity

Watch the solution video and check out the Instructor Notes! Notes:


Understanding Noise: Age to Age Months

Notes:

# Median friend count per age as a line.
ggplot(pf.fc_by_age, aes(age, friend_count_median)) +
  geom_line()

# First six rows of the per-age summary.
head(pf.fc_by_age, n = 6)
## Source: local data frame [6 x 4]
## 
##     age friend_count_mean friend_count_median     n
##   (int)             (dbl)               (dbl) (int)
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196
# Rows 17 through 19 of the per-age summary (ages 29-31 in the output).
pf.fc_by_age[seq(17, 19), ]
## Source: local data frame [3 x 4]
## 
##     age friend_count_mean friend_count_median     n
##   (int)             (dbl)               (dbl) (int)
## 1    29          120.8182                66.0  1936
## 2    30          115.2080                67.5  1716
## 3    31          118.4599                63.0  1694

Age with Months Means

# Age in fractional years: whole years plus (12 - birth month) / 12.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)

Programming Assignment

# Age in fractional years: whole years plus (12 - birth month) / 12.
pf$age_with_months <- (pf$age + (12 - pf$dob_month) / 12)

# Mean, median and count of friend_count for every distinct
# age_with_months value, ordered by ascending age_with_months.
# Uses %>% rather than %.%: dplyr reports %.% as deprecated and names %>%
# as its replacement.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(friend_count_mean = mean(friend_count),
            friend_count_median = median(friend_count),
            n = n()) %>%
  arrange(age_with_months)

# With only x supplied, qplot() bins friend_count_mean (see the stat_bin
# message in the output).
qplot(x = friend_count_mean, data = pf.fc_by_age_months)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# The same summary built with plain function calls instead of a pipeline:
# group by age_with_months, summarise friend_count, then sort by
# age_with_months.
age_with_months_groups <- group_by(pf, age_with_months)
pf.fc_by_age_months2 <- arrange(
  summarise(
    age_with_months_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age_with_months
)

Noise in Conditional Means

# Mean friend count by fractional age as horizontally jittered points,
# with the view zoomed to ages 12-71.
ggplot(pf.fc_by_age_months, aes(age_with_months, friend_count_mean)) +
  geom_point(position = position_jitter(height = 0),
             color = 'blue',
             alpha = 68 / 100) +
  coord_cartesian(xlim = c(12, 71))

# The same means joined by a line, same zoom.
ggplot(pf.fc_by_age_months, aes(age_with_months, friend_count_mean)) +
  geom_line(color = 'blue') +
  coord_cartesian(xlim = c(12, 71))

# Line again, but the data itself is cut to age_with_months < 71 instead
# of zooming the view.
ggplot(subset(pf.fc_by_age_months, age_with_months < 71),
       aes(age_with_months, friend_count_mean)) +
  geom_line(color = 'blue')

```


Smoothing Conditional Means

Notes:

library(gridExtra)

# Three views of mean friend count for users younger than 71, stacked in
# one column by grid.arrange() in the order p2, p1, p3.
# Assignments use `<-`, the conventional R assignment operator.

# p1: mean by whole-year age, with a smoother on top of the raw line.
p1 <- ggplot(aes(x = age, y = friend_count_mean),
             data = subset(pf.fc_by_age, age < 71)) +
  geom_line() +
  geom_smooth()

# p2: mean by fractional age (age_with_months), with a smoother.
p2 <- ggplot(aes(x = age_with_months, y = friend_count_mean),
             data = subset(pf.fc_by_age_months, age_with_months < 71)) +
  geom_line() +
  geom_smooth()

# p3: mean of friend_count with age rounded to the nearest multiple of 5.
# `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0).
p3 <- ggplot(aes(x = round(age / 5) * 5, y = friend_count),
             data = subset(pf, age < 71)) +
  geom_line(stat = 'summary', fun = mean)

grid.arrange(p2, p1, p3, ncol = 1)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.


Which Plot to Choose?

Notes:

you don’t have to choose in EDA. exploratory!


Analyzing Two Variables

Reflection:

Many graphing tools exist in R for exploratory analysis of two variables. dplyr and ggplot2 appear to offer many more options and features worth investigating.


Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!